Pull in mouse cell metadata and retain eye cells.
Output the mouse ocular barcodes.
library(tidyverse)
sample_meta <- data.table::fread('~/git/scEiaD_quant/sample_meta.scEiaD_v1.2025_02_03.02.tsv.gz')
cell_meta <- data.table::fread('~/data/scEiaD_modeling/mm111.adata.solo.20250131.obs.csv.gz')[,-1] %>%
relocate(barcode) %>%
filter(solo_doublet == "FALSE")
mm111_eye <- cell_meta %>%
filter(
organ == 'Eye',
organism == 'Mus musculus',
!grepl("^#", sample_accession),
source == 'Tissue')
#mm111_eye$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/mm111_eye_bcs.20250221.csv.gz'))
sinteractive --mem=128G --time=8:00:00 --gres=gpu:a100:1
cd /data/OGVFB_BG/scEiaD/2024_02_28/snakeout/mm111_developing_eye
source /data/$USER/conda/etc/profile.d/conda.sh && source /data/$USER/conda/etc/profile.d/mamba.sh
mamba activate rapids_singlecell
bash ct_projection_call.sh
sceiad_meta <- fst::read_fst('~/data/scEiaD_2022_02/meta_filter.fst')
G3;fstcore package v0.9.18
gG3;(OpenMP was not detected, using single threaded mode)
g
ctp_mm111__human_modeling <- data.table::fread('/Users/mcgaugheyd/data/scEiaD_modeling/mm111_developing_eye/mm111.eye.human_CT_projections_20120128.csv.gz') %>% select(-17) %>% left_join(sceiad_meta %>% select(barcode = Barcode, CellType_predict), by = 'barcode') %>%
# only look at >= 10 day cells
mutate(age = as.numeric(age)) %>%
filter(age < 10)
G2;H2;Warningh: There was 1 warning in `mutate()`.
ℹ In argument: `age = as.numeric(age)`.
Caused by warning:
! NAs introduced by coerciong
label <- 'MajorCellType'
machine_label <- 'CT__sceiad_20250211_dev_mmGeneFilter'
most_common_mislabel <- ctp_mm111__human_modeling %>%
group_by(.data[[label]],.data[[machine_label]]) %>%
summarise(Count = n()) %>%
arrange(.data[[label]], -Count) %>%
slice_max(order_by = Count, n = 2) %>%
filter(.data[[label]] != .data[[machine_label]])
`summarise()` has grouped output by 'MajorCellType'. You can override using the `.groups` argument.
ctp_mm111__human_modeling %>%
mutate(tf = case_when(.data[[label]] == .data[[machine_label]] ~ TRUE,
TRUE ~ FALSE)) %>%
group_by(.data[[label]]) %>%
summarise(accuracy = sum(tf)/length(tf)) %>%
arrange(.data[[label]]) %>%
left_join(most_common_mislabel %>%
select(any_of(label),
`most common mislabel` = any_of(machine_label),
`mislabel count` = Count))
Joining with `by = join_by(MajorCellType)`
umap1 = 'umap1_sceiad_20250211_dev_mmGeneFilter'
umap2 = 'umap2_sceiad_20250211_dev_mmGeneFilter'
ct = 'CT__sceiad_20250211_dev_mmGeneFilter'
color = 'CT__sceiad_20250211_dev_mmGeneFilter'
score <- 'CT__sceiad_20250211_dev_mmGeneFilter__max_score'
umap_ct_plotter <- function(obj, umap1, umap2, ct, color, labels = TRUE){
plot <- obj %>%
mutate(leiden = as.factor(leiden)) %>%
ggplot(aes(x=.data[[umap1]],y=.data[[umap2]])) +
scattermore::geom_scattermore(aes(color = .data[[color]]), pointsize = 0.8, alpha = 0.5) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::kelly(), pals::alphabet()) %>%
unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none")
if (labels){
plot <- plot + ggrepel::geom_label_repel(data = . %>%
group_by(.data[[ct]]) %>%
summarise(across(all_of(c(umap1,umap2)), median)),
aes(label = .data[[ct]], color = .data[[color]]))
}
print(plot)
}
umap_score_plotter <- function(obj, umap1, umap2, ct, score){
obj %>%
ggplot(aes(x=.data[[umap1]],y=.data[[umap2]])) +
scattermore::geom_scattermore(aes(color = .data[[score]]), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>%
group_by(.data[[ct]]) %>%
summarise(across(all_of(c(umap1,umap2)), median)),
aes(label = .data[[ct]]), max.overlaps = Inf) +
scale_color_viridis_c() +
cowplot::theme_cowplot() + theme(legend.position = "none")
}
umap_ct_plotter(ctp_mm111__human_modeling, umap1, umap2, ct, color)
umap_ct_plotter(ctp_mm111__human_modeling, umap1, umap2, 'leiden', 'leiden')
umap_score_plotter(ctp_mm111__human_modeling, umap1, umap2, ct, score)
Looks quite good - perhaps only pericyte and ciliary margin (neither of which are in the big model) are wrong?
ctp_mm111__human_modeling %>%
mcHelpeRs::sum_rat('CellType_predict', 'MajorCellType', 'CT__sceiad_20250211_dev_mmGeneFilter', threshold = 0.05) %>%
#filter(MajorCellType != '') %>%
DT::datatable()
As a crucial step is subsetting of each CT to prevent a very high n set (e.g. rods) to dominate
ctp_mm111__human_modeling <- ctp_mm111__human_modeling %>%
mutate(rough_ct = case_when(CellType_predict %in% c("Ciliary Margin", "Pericyte") ~ tolower(CellType_predict),
TRUE ~ CT__sceiad_20250211_dev_mmGeneFilter))
Pick ref barcodes to make scVI models
set.seed(20250303)
keep_ct <- ctp_mm111__human_modeling %>% group_by(rough_ct) %>% summarise(Count = n()) %>% filter(Count > 10) %>% pull(rough_ct) %>% unique() %>% sort()
mm111_dev_eye_ref_bcs <- ctp_mm111__human_modeling %>% ungroup() %>%
filter(rough_ct %in% keep_ct) %>%
group_by(study_accession, rough_ct) %>%
sample_n(1000, replace = TRUE) %>%
unique()
mm111_dev_eye_query_bcs <- ctp_mm111__human_modeling %>%
ungroup() %>%
filter(!barcode %in% mm111_dev_eye_ref_bcs$barcode)
#mm111_dev_eye_ref_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/mm111_dev_eye_ref_bcs.20250303.csv.gz'))
#mm111_dev_eye_query_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/mm111_dev_eye_query_bcs.20250303.csv.gz'))
cd /data/OGVFB_BG/scEiaD/2024_02_28/snakeout/mm111_developing_eye
sbatch --time=10:00:00 snakecall.sh
source('analysis_scripts.R')
obs <- pull_obs('~/data/scEiaD_modeling/mm111_developing_eye/mm111_dev_eye_20250303_2000hvg_50e_30l.obs.csv.gz', machine_label = 'scANVI_MCT', cluster_col = 'leiden5')
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
obs$obs <- obs$obs %>% left_join(ctp_mm111__human_modeling %>% select(barcode, CT__sceiad_20250211_dev_mmGeneFilter, CT__sceiad_20250211_dev_mmGeneFilter__max_score, umap1_sceiad_20250211_dev_mmGeneFilter,umap2_sceiad_20250211_dev_mmGeneFilter), by = c("barcodei" = 'barcode'))
obs$labels$mCT_scvi <- obs$obs %>% mcHelpeRs::sum_rat("leiden5","CT__sceiad_20250211_dev_mmGeneFilter") %>% slice_max(order_by = Count, n = 1) %>% pull(CT__sceiad_20250211_dev_mmGeneFilter)
obs$labels$mMCT_scvi <- obs$obs %>% mcHelpeRs::sum_rat("leiden5","CT__sceiad_20250211_dev_mmGeneFilter") %>% summarise(out = paste0(CT__sceiad_20250211_dev_mmGeneFilter, collapse = ', ')) %>% pull(out)
obs$obs %>%
left_join(obs$labels, by = 'leiden5') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = scANVI_MCT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(scANVI_MCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = scANVI_MCT, color = scANVI_MCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("MajorCellType (scANVI)")
obs$obs %>%
left_join(obs$labels, by = 'leiden5') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT__sceiad_20250211_dev_mmGeneFilter), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CT__sceiad_20250211_dev_mmGeneFilter) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CT__sceiad_20250211_dev_mmGeneFilter, color = CT__sceiad_20250211_dev_mmGeneFilter)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("CT__sceiad_20250211_dev_mmGeneFilter")
obs$obs %>%
left_join(obs$labels, by = 'leiden5') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT__sceiad_20250211_dev_mmGeneFilter), pointsize = 0.8, alpha = 0.5) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("CT__sceiad_20250211_dev_mmGeneFilter") +
coord_cartesian(xlim = c(-20,20), ylim = c(-15,15)) +
facet_wrap(~study_accession)
obs$obs %>%
left_join(obs$labels, by = 'leiden5') %>%
mutate(age = as.integer(age)) %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = age), pointsize = 0.8, alpha = 0.5) +
# ggrepel::geom_label_repel(data = . %>% group_by(CT__sceiad_20250211_dev_mmGeneFilter) %>%
# summarise(umap1 = median(umap1),
# umap2 = median(umap2)),
# aes(label = CT__sceiad_20250211_dev_mmGeneFilter)) +
scale_color_viridis_c(option = 'H') +
cowplot::theme_cowplot() +
ggtitle("CT__sceiad_20250211_dev_mmGeneFilter") +
coord_cartesian(xlim = c(-20,20), ylim = c(-15,15))
obs$obs %>%
left_join(obs$labels, by = 'leiden5') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = as.factor(leiden5)), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_text_repel(data = . %>% group_by(mCT_scvi, leiden5) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2),),
aes(label = paste0(mCT_scvi,'-',leiden5),color = as.factor(leiden5)), bg.color = 'white') +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::alphabet(), pals::kelly()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("leiden5")
pb <- data.table::fread('/Users/mcgaugheyd/data/scEiaD_modeling/mm111_developing_eye/mm111_dev_eye_20250303_2000hvg_50e_30l.pseudoBulk.leiden5.csv.gz')
colnames(pb) <- gsub("\\.\\d+","",colnames(pb))
hvg <- data.table::fread('/Users/mcgaugheyd/data/scEiaD_modeling/mm111_developing_eye/hvg2000.csv.gz')[-1,]
rnames <- pb$V1
clust <- str_extract(rnames, '\\d+') %>% as.integer()
pb <- pb[,-1] %>% as.matrix()
row.names(pb) <- as.character(clust)
pb <- pb[as.character(obs$labels$leiden5),]
pb_norm <- metamoRph::normalize_data(t(pb), sample_scale = 'cpm') %>% t()
G3;Sample CPM scaling
gG3;log1p scaling
g
pb_full <- pb_norm
pb_norm <- pb_norm[,hvg$V2]
#pb_norm <- pb_norm[,hvg$V2[!hvg$V2 %in% c(cc_genes,ribo_genes)]]
# https://stats.stackexchange.com/questions/31565/compute-a-cosine-dissimilarity-matrix-in-r
sim <- pb_norm / sqrt(rowSums(pb_norm * pb_norm))
sim <- sim %*% t(sim)
D_sim <- as.dist(1 - sim)
hclust_sim <- hclust(D_sim, method = 'average')
hclust_sim$labels <- obs$labels %>% pull(leiden5)
library(ggtree)
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$label, by = c("label" = "leiden5"))
p + layout_dendrogram() +
geom_tiplab(aes(label = paste(label, mCT_scvi, sep = ' - '), color = mCT_scvi)) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
guides(color="none")
diff <- pull_diff("~/data/scEiaD_modeling/mm111_developing_eye/mm111_dev_eye_20250303_2000hvg_50e_30l.difftesting.leiden5.csv.gz", orgdb = org.Mm.eg.db::org.Mm.eg.db, cluster_col = 'leiden5')
G3;
gG3;'select()' returned 1:many mapping between keys and columns
gG2;H2;Warningh in left_join(., conv_table, by = c("ENSEMBL")) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 13 of `x` matches multiple rows in `y`.
ℹ Row 22694 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
G2;H2;Warningh in left_join(., conv_table, by = c("ENSEMBL")) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 7 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
conv_table <- AnnotationDbi::select(org.Mm.eg.db::org.Mm.eg.db,
keys=gsub('\\.\\d+','',unique(diff$diff_testing$ENSEMBL)),
columns=c("ENSEMBL","SYMBOL", "GENENAME", "ENTREZID"), keytype="ENSEMBL")
G3;'select()' returned 1:many mapping between keys and columns
g
library(ComplexHeatmap)
hm_maker <- function(markers, target,
cdiff = diff,
clabels = labels,
remove = remove_leiden5){
tib <- cdiff$diff_testing %>%
left_join(clabels, by = c('base'='leiden5')) %>%
left_join(conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) %>%
filter(SYMBOL %in% markers) %>%
mutate(base = as.character(base),
base = paste0(base, ' - ', CT)) %>%
select(SYMBOL, base, logfoldchanges) %>%
pivot_wider(values_from = logfoldchanges, names_from = base)
mat <- tib %>% select(-1) %>% as.matrix()
row.names(mat) <- tib %>% pull(1)
ha_column = ComplexHeatmap::HeatmapAnnotation(df = data.frame(Target = ifelse(grepl(target, colnames(tib)[-1]), "Target","Not"),
Remove = ifelse(str_extract(colnames(tib)[-1], '\\d+') %in% remove, "Remove","Retain")),
col = list(Target = c("Target" = "black","Not" = "white"),
Remove = c("Remove" = "red", "Retain" = "white")))
col_fun = circlize::colorRamp2(c(-3, 0, 3), c("blue", "white", "red"))
draw(Heatmap(mat, col=col_fun,
name = 'logFoldChange',
top_annotation = ha_column)
)
}
mct = obs$obs %>% group_by(leiden5, CT__sceiad_20250211_dev_mmGeneFilter) %>% count() %>% ungroup() %>% group_by(leiden5) %>% slice_max(order_by = n, n = 1)
filter_mct = obs$obs %>% filter(CT__sceiad_20250211_dev_mmGeneFilter__max_score > 0.9) %>% group_by(leiden5, CT__sceiad_20250211_dev_mmGeneFilter) %>% count() %>% ungroup() %>% group_by(leiden5) %>% slice_max(order_by = n, n = 1)
mct %>% left_join(filter_mct, by = 'leiden5') %>% filter(CT__sceiad_20250211_dev_mmGeneFilter.x != CT__sceiad_20250211_dev_mmGeneFilter.y)
ct_map <- c(
'21' = 'cone (ml)',
'61' = 'periocular mesenchyme',
'62' = 'ciliary margin',
'65' = 'rod (precursor)',
'85' = 'lens',
'77' = 'astrocyte',
'13' = 'retinal ganglion',
'44' = 'retinal ganglion',
'51' = 'retinal ganglion',
'47' = 'horizontal',
'14' = 'neurogenic', # check above with score discrepancy
'87' = 'neurogenic', # check above with score discrepancy
'70' = 'bipolar (precursor)' ) # check above with score discrepancy
labels <- obs$labels %>%
mutate(CT = ifelse(as.character(leiden5) %in% names(ct_map), ct_map[as.character(leiden5)], mCT_scvi))
remove_leiden5 <- c(43,74) # odd umap and marker expression (mixed ct)
markers <- c("HES1",
"ZFP36L2",
# "HES6",
# "ATOH7",
"VIM",
"CCND1",
"SFRP2",
"SPP1",
"ZFP36L1",
"TF",
"FOS",
"TTYH1") %>% str_to_title()
mellough_markers <- read_csv("~/git/eyeMarkers/lists/rpc_markers__Mellough2019.csv")
Rows: 75 Columns: 2
── Column specification ──────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): HGNC, Cell Type
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
remove_leiden5 <- c()
markers <- mellough_markers %>% filter(`Cell Type` == 'RPC') %>% pull(HGNC) %>% str_to_title()
more <- c("PAX6","NEUROD1","ATOH7","HES6") %>% str_to_title()
hm_maker(c(markers, more), "rpc|neuro", clabels = obs$labels %>% mutate(CT = mCT))
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c("LUM","DCN","VIM","PDGFRA","COL1A2", # https://www.nature.com/articles/s42003-020-0922-4
"MGP","MEG3","DCN","APOD","ANGPTL7","EFEMP1","BMP5","PRRX1")
markers <- #c("VIM","FAP","COL1A1","PDGFRB","S100A4", # fibro
c("PENK", "PITX2", #POM
"CDH5","VWF", # endo
"MYF5", "USP18", #connective tissue
"CDH1","KRT19","EPCAM", # epi
"KERA",# keratocyte)
"A2M")
hm_maker(markers %>% str_to_title(), "periocu|endo|epi|kera",clabels = labels)
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- conv_table %>% filter(grepl("^Cry", SYMBOL)) %>% pull(SYMBOL)
hm_maker(markers %>% str_to_title(), "lens",clabels = labels)
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c("Rpe65","Mitf","Best1")
hm_maker(markers %>% str_to_title(), "rpe",clabels = labels)
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c( "Gfap","Pax2")
hm_maker(markers %>% str_to_title(), "astroc",clabels = labels)
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c( "Glul","Clu","Apoe")
hm_maker(markers %>% str_to_title(), "astroc",clabels = labels)
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c("LHX1","ONECUT1") %>% str_to_title()
hm_maker(markers, "hori")
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c('ARR3','OPN1LW','OPN1SW','RHO', 'OPN1MW', 'RCVRN',"CRX","PROM1","CNGA1","PDE6A") %>% str_to_title()
#markers <- mellough_markers %>% filter(`Cell Type` %in% c('Rod','Cone')) %>% pull(HGNC)
hm_maker(markers, "rod|cone")
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c("GRIK1","IRX6","LRTM1","PCP2","PRKCA","TRPM1","VSX1","VSX2") %>% str_to_title()
#markers <- mellough_markers %>% filter(`Cell Type` == 'Bipolar') %>% pull(HGNC)
hm_maker(markers, "bipolar")
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- c('GAD1','GAD2','SLC6A9','NFIA') %>% str_to_title()
markers <- mellough_markers %>% filter(`Cell Type` == 'Amacrine') %>% pull(HGNC) %>% str_to_title()
hm_maker(markers, "amacr")
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
markers <- mellough_markers %>% filter(`Cell Type` == 'RGC') %>% pull(HGNC) %>% str_to_title()
hm_maker(markers, "ganglion")
Joining with `by = join_by(ENSEMBL)`
G2;H2;Warningh in left_join(., conv_table %>% select(SYMBOL, ENSEMBL) %>% unique()) :
Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 234 of `x` matches multiple rows in `y`.
ℹ Row 296 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.g
nobs <- obs$obs %>%
left_join(labels, by = 'leiden5') %>%
filter(!leiden5 %in% remove_leiden5)
# obs$obs %>% filter(leiden5 %in% c(14,57)) %>% mcHelpeRs::sum_rat(leiden5, CT__sceiad_20250211_dev_mmGeneFilter)
# # tweaking for cluster 14 and 57 as these don't seem to be clearly bipolar precursors
# nobs <- nobs %>% mutate(CT = case_when(leiden5 == 14 & CT__sceiad_20250211_dev_mmGeneFilter == 'bipolar (precursor)' ~ 'rod (precursor)',
# leiden5 == 14 & CT__sceiad_20250211_dev_mmGeneFilter == 'neurogenic' ~ 'neurogenic',
# leiden5 == 14 & CT__sceiad_20250211_dev_mmGeneFilter == 'rod (precursor)' ~ 'rod (precursor)',
# leiden5 == 14 & CT__sceiad_20250211_dev_mmGeneFilter == 'cone (precursor)' ~ 'rod (precursor)',
#
# leiden5 == 57 & CT__sceiad_20250211_dev_mmGeneFilter == 'bipolar (precursor)' ~ 'rod (precursor)',
# leiden5 == 57 & CT__sceiad_20250211_dev_mmGeneFilter == 'rod (precursor)' ~ 'rod (precursor)',
# leiden5 == 57 & CT__sceiad_20250211_dev_mmGeneFilter == 'neurogenic' ~ 'neurogenic',
# leiden5 == 57 & CT__sceiad_20250211_dev_mmGeneFilter == 'bipolar' ~ 'bipolar',
# leiden5 == 57 ~ 'neurogenic',
# TRUE ~ CT
# ))
nobs %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CT, color = CT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("CT")
Output updated CT calls, update the h5ad, and re-run scVI
set.seed(2025-03-05)
ref <- nobs %>%
filter(scANVI_MCT_max_score > 0.9) |>
group_by(study_accession, CT) %>%
slice_sample(n = 1000, replace = TRUE) %>%
unique()
query <- nobs %>% filter(!barcodei %in% ref$barcodei)
# ref$barcodei %>% write(gzfile('~/git/scEiaD_modeling/data/mm111_dev_eye_ref_bcs.20250305.stage3.csv.gz'))
# query$barcodei %>% write(gzfile('~/git/scEiaD_modeling/data/mm111_dev_eye_query_bcs.20250305.stage3.csv.gz'))
# #
# nobs %>% dplyr::rename(barcode = barcodei) %>% write_csv('~/git/scEiaD_modeling/data/Mouse_Developing_Eye__stage3_CTcalls.freeze20250305.01.csv.gz')
cd /data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_developing_eye/stage4
source /data/$USER/conda/etc/profile.d/conda.sh && source /data/$USER/conda/etc/profile.d/mamba.sh
mamba activate rapids_singlecell
python ~/git/scEiaD_modeling/workflow/scripts/append_obs.py ../../mm111.adata.solo.20250131.h5ad /home/mcgaugheyd/git/scEiaD_modeling/data/Mouse_Developing_Eye__stage3_CTcalls.freeze20250305.01.csv.gz mm111.adata.solo.20250305.dev.stage3.h5ad --transfer_columns CT
obs_s3 <- pull_obs('~/data/scEiaD_modeling/mm111_developing_eye/stage3/mm111_dev_eye_20250305_stage3_2000hvg_200e_30l.obs.csv.gz', label = 'CT', machine_label = 'scANVI_CT', cluster_col = 'leiden5')
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
obs_s3$obs %>% ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CT, color = CT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("CT")
obs_s3$obs %>%
left_join(obs_s3$labels, by = 'leiden5') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = as.factor(leiden5)), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_text_repel(data = . %>% group_by(mCT, leiden5) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2),),
aes(label = paste0(mCT,'-',leiden5),color = as.factor(leiden5)), bg.color = 'white') +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::alphabet(), pals::kelly(), pals::brewer.set1(10), pals::okabe(), pals::watlington()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("leiden5")
machine_label = 'scANVI_CT'; label = 'CT'
obs_s3$obs %>%
#filter(scANVI_CT_max_score > 0.9) %>%
group_by(.data[[label]],.data[[machine_label]]) %>%
summarise(Count = n()) %>%
mutate(Ratio = Count/sum(Count)) %>%
ggplot(aes(x=.data[[label]],y=.data[[machine_label]],fill=Ratio, label = round(Ratio, 2))) +
geom_tile() +
shadowtext::geom_shadowtext() +
cowplot::theme_cowplot() +
scale_fill_viridis_c(begin = 0, end = 1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
`summarise()` has grouped output by 'CT'. You can override using the `.groups` argument.
# retains CT calls >= 0.05 of a cluster. anything below gets changed to the dominant ct
nobs_s3_cleaning <- obs_s3$obs %>%
group_by(leiden5, scANVI_CT) %>%
summarise(Count = n()) %>%
mutate(Ratio = Count / sum(Count)) %>%
mutate(dominant_celltype = scANVI_CT[which.max(Count)]) %>%
mutate(CTc = case_when(Ratio < 0.05 ~ dominant_celltype,
TRUE ~ scANVI_CT))
`summarise()` has grouped output by 'leiden5'. You can override using the `.groups` argument.
nobs_s3 <- obs_s3$obs %>%
left_join(nobs_s3_cleaning %>%
dplyr::select(leiden5, scANVI_CT, CTc), by = c("leiden5","scANVI_CT"))
nobs_s3 <- nobs_s3 %>% select(-umap1, -umap2) %>% left_join(obs$obs %>% dplyr::select(umap1,umap2, barcodei), by = 'barcodei')
nobs_s3 %>% mcHelpeRs::sum_rat(CT, CTc,threshold = 0.01)
nobs_s3 %>% ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CTc), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CTc) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CTc, color = CTc)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
ggtitle("CTc")
Plot of the ML cell type call confidence (1 is 100% confidence). We see that in several “precursor” positions in the UMAP the confidence is much lower.
nobs_s3 %>% ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = scANVI_CT_max_score), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CTc) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CTc), color = 'black') +
scale_color_viridis_c(option = 'rocket') +
cowplot::theme_cowplot() +
ggtitle("CTc")
sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.7.4
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] ComplexHeatmap_2.20.0 ggtree_3.12.0 fstcore_0.9.18 lubridate_1.9.3 forcats_1.0.0
[6] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[11] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 shape_1.4.6.1 rstudioapi_0.16.0 jsonlite_1.8.8
[5] magrittr_2.0.3 magick_2.8.5 farver_2.1.2 rmarkdown_2.27
[9] GlobalOptions_0.1.2 fs_1.6.4 zlibbioc_1.50.0 vctrs_0.6.5
[13] Cairo_1.6-2 memoise_2.0.1 DelayedMatrixStats_1.26.0 htmltools_0.5.8.1
[17] S4Arrays_1.4.1 BiocNeighbors_1.22.0 SparseArray_1.4.8 gridGraphics_0.5-1
[21] sass_0.4.9 bslib_0.8.0 htmlwidgets_1.6.4 cachem_1.1.0
[25] igraph_2.0.3 iterators_1.0.14 lifecycle_1.0.4 pkgconfig_2.0.3
[29] rsvd_1.0.5 Matrix_1.7-0 R6_2.5.1 fastmap_1.2.0
[33] clue_0.3-65 GenomeInfoDbData_1.2.12 MatrixGenerics_1.16.0 digest_0.6.36
[37] aplot_0.2.3 colorspace_2.1-1 AnnotationDbi_1.66.0 patchwork_1.2.0
[41] S4Vectors_0.42.1 dqrng_0.4.1 irlba_2.3.5.1 RSQLite_2.3.7
[45] crosstalk_1.2.1 GenomicRanges_1.56.1 org.Hs.eg.db_3.19.1 beachmat_2.20.0
[49] labeling_0.4.3 org.Mm.eg.db_3.19.1 fansi_1.0.6 timechange_0.3.0
[53] httr_1.4.7 abind_1.4-5 compiler_4.4.1 doParallel_1.0.17
[57] bit64_4.0.5 withr_3.0.0 BiocParallel_1.38.0 DBI_1.2.3
[61] R.utils_2.12.3 maps_3.4.2 DelayedArray_0.30.1 rjson_0.2.21
[65] bluster_1.14.0 tools_4.4.1 ape_5.8 fst_0.9.8
[69] R.oo_1.26.0 mcHelpeRs_0.0.2 glue_1.7.0 nlme_3.1-165
[73] shadowtext_0.1.4 cluster_2.1.6 generics_0.1.3 gtable_0.3.5
[77] tzdb_0.4.0 R.methodsS3_1.8.2 data.table_1.16.99 hms_1.1.3
[81] BiocSingular_1.20.0 ScaledMatrix_1.12.0 metapod_1.12.0 utf8_1.2.4
[85] XVector_0.44.0 BiocGenerics_0.50.0 foreach_1.5.2 ggrepel_0.9.5
[89] pillar_1.9.0 vroom_1.6.5 yulab.utils_0.1.5 limma_3.60.4
[93] pals_1.9 circlize_0.4.16 treeio_1.28.0 lattice_0.22-6
[97] bit_4.0.5 tidyselect_1.2.1 SingleCellExperiment_1.26.0 locfit_1.5-9.10
[101] Biostrings_2.72.1 scuttle_1.14.0 knitr_1.48 IRanges_2.38.1
[105] edgeR_4.2.1 SummarizedExperiment_1.34.0 scattermore_1.2 stats4_4.4.1
[109] xfun_0.48 Biobase_2.64.0 statmod_1.5.0 matrixStats_1.3.0
[113] DT_0.33 stringi_1.8.4 UCSC.utils_1.0.0 lazyeval_0.2.2
[117] ggfun_0.1.5 yaml_2.3.10 evaluate_0.24.0 codetools_0.2-20
[121] ggplotify_0.1.2 cli_3.6.3 munsell_0.5.1 jquerylib_0.1.4
[125] dichromat_2.0-0.1 Rcpp_1.0.13 GenomeInfoDb_1.40.1 mapproj_1.2.11
[129] png_0.1-8 parallel_4.4.1 blob_1.2.4 scran_1.32.0
[133] sparseMatrixStats_1.16.0 viridisLite_0.4.2 tidytree_0.4.6 metamoRph_0.2.1
[137] ggridges_0.5.6 scales_1.3.0 crayon_1.5.3 GetoptLong_1.0.5
[141] rlang_1.1.4 KEGGREST_1.44.1 cowplot_1.1.3